## Causal Inference Analysis on the effect of Eigenvector centrality of the language 
## on the number of famous people speaking that language

### Alternative models -- Descritizing the centrality.
#WikiFame$twitter.is.EVcentral <- NULL

twitter.is.EVcentral <- WikiFame$twitter.eigenvector>0.01
twitter.is.EVcentral <- factor(twitter.is.EVcentral,labels=c("not central","central"))
WikiFame <- cbind(WikiFame,twitter.is.EVcentral)

wikipedia.is.EVcentral <- WikiFame$wikipedia.eigenvector>0.01
wikipedia.is.EVcentral <- factor(wikipedia.is.EVcentral,labels=c("not central","central"))
WikiFame <- cbind(WikiFame,wikipedia.is.EVcentral)

books.is.EVcentral <- WikiFame$books.eigenvector>0.01
books.is.EVcentral <- factor(books.is.EVcentral,labels=c("not central","central"))
WikiFame <- cbind(WikiFame,books.is.EVcentral)

## Modified models
l5.mod <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc)+ twitter.is.EVcentral, data=WikiFame)
l6.mod <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc)+ wikipedia.is.EVcentral, data=WikiFame)
l7.mod <- lm(formula = log10(exports_1800_1950) ~ log10(actual_speakers_m) + log10(gdp_pc)+ books.is.EVcentral, data=WikiFame)

## How correlated were the previous models within themselves?
plot(l5$fitted.values,l6$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l6$fitted.values~l5$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l5$fitted.values,l6$fitted.value)

plot(l5$fitted.values,l7$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l7$fitted.values~l5$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l5$fitted.values,l7$fitted.value)

plot(l6$fitted.values,l7$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l7$fitted.values~l6$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l6$fitted.values,l7$fitted.value)

## How correlated are the new models to the corresponding previous models?
plot(l5.mod$fitted.values,l5$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l5$fitted.values~l5.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l5.mod$fitted.values,l5$fitted.value)

plot(l6.mod$fitted.values,l6$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l6$fitted.values~l6.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l6.mod$fitted.values,l6$fitted.value)

plot(l7.mod$fitted.values,l7$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l7$fitted.values~l7.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l7.mod$fitted.values,l7$fitted.value)

## How correlated are the modified models within themselves? -- In-Sample Fit Chart
plot(l5.mod$fitted.values,l6.mod$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l6.mod$fitted.values~l5.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l5.mod$fitted.values,l6.mod$fitted.value)

plot(l5.mod$fitted.values,l7.mod$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l7.mod$fitted.values~l5.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l5.mod$fitted.values,l7.mod$fitted.value)

plot(l6.mod$fitted.values,l7.mod$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l7.mod$fitted.values~l6.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l6.mod$fitted.values,l7.mod$fitted.value)

# Convex Hull -- Checking for extrapolation bias
library(Zelig)
library(WhatIf)
# Twitter
## Hull 1: Switch 1 to 0
a<-whatif(~log10(actual_speakers_m) + log10(gdp_pc)+ twitter.is.EVcentral, 
          data=WikiFame[WikiFame$twitter.is.EVcentral=="not central",], 
          cfact=WikiFame[WikiFame$twitter.is.EVcentral=="central",]) 
summary(a)

## Hull 2: Switch 0 to 1
b<-whatif(~log10(actual_speakers_m) + log10(gdp_pc)+ twitter.is.EVcentral,  
          data=WikiFame[WikiFame$twitter.is.EVcentral=="central",], 
          cfact=WikiFame[WikiFame$twitter.is.EVcentral=="not central",]) 
summary(b)

# Wikipedia
## Hull 1: Switch 1 to 0
a<-whatif(~log10(actual_speakers_m) + log10(gdp_pc)+ wikipedia.is.EVcentral, 
          data=WikiFame[WikiFame$wikipedia.is.EVcentral=="not central",], 
          cfact=WikiFame[WikiFame$wikipedia.is.EVcentral=="central",]) 
summary(a)

## Hull 2: Switch 0 to 1
b<-whatif(~log10(actual_speakers_m) + log10(gdp_pc)+ books.is.EVcentral,  
          data=WikiFame[WikiFame$books.is.EVcentral=="central",], 
          cfact=WikiFame[WikiFame$books.is.EVcentral=="not central",]) 
summary(b)

# Books
## Hull 1: Switch 1 to 0
a<-whatif(~log10(actual_speakers_m) + log10(gdp_pc)+ books.is.EVcentral, 
          data=WikiFame[WikiFame$books.is.EVcentral=="not central",], 
          cfact=WikiFame[WikiFame$books.is.EVcentral=="central",]) 
summary(a)

## Hull 2: Switch 0 to 1
b<-whatif(~log10(actual_speakers_m) + log10(gdp_pc)+ books.is.EVcentral,  
          data=WikiFame[WikiFame$books.is.EVcentral=="central",], 
          cfact=WikiFame[WikiFame$books.is.EVcentral=="not central",]) 
summary(b)



# Measuring imbalance
library(cem)
imbalance.t <- imbalance(group=WikiFame$twitter.is.EVcentral, data=WikiFame[,c(2,3)])
imbalance.t
imbalance.w <- imbalance(group=WikiFame$wikipedia.is.EVcentral, data=WikiFame[,c(2,3)])
imbalance.w
imbalance.b <- imbalance(group=WikiFame$books.is.EVcentral, data=WikiFame[,c(2,3)])
imbalance.b

# Matching
library(rgenoud)
library(Matching)
library(MatchIt)

### Twitter
## Propensity Score Matching
logit<-glm(twitter.is.EVcentral~log10(actual_speakers_m) + log10(gdp_pc),
           family=binomial(link="logit"), data=WikiFame)
summary(logit)
t.fitted<-logit$fitted.values
WikiFame <- cbind(WikiFame,t.fitted)

par(mfrow = c(1,2))
hist(t.fitted[WikiFame$twitter.is.EVcentral=="central"],  
     xlab="Propensity Score", main="Treatment",5)
hist(t.fitted[WikiFame$twitter.is.EVcentral=="not central"],  
     xlab="Propensity Score", main="Control",5)
dev.off()

#m<-Match(Y=log10(WikiFame$exports_1800_1950),Tr=WikiFame$twitter.is.EVcentral=="central",X=WikiFame$t.fitted,
#        estimand="ATT", BiasAdjust=TRUE, Weight=2, M=1)
#summary(m)

# OR
m.out<-matchit(twitter.is.EVcentral=="central"~log10(actual_speakers_m) + log10(gdp_pc)+log10(gdp_pc)*log10(actual_speakers_m), 
               data=WikiFame, distance="logit")

imbalance.t <- imbalance(group=match.data(m.out)$twitter.is.EVcentral, data=match.data(m.out)[,c(2,3)])
imbalance.t
plot(m.out)

matched.t <- match.data(m.out)
treated.mean.t <- mean(matched.t$exports_1800_1950[matched.t$twitter.is.EVcentral=="central"])
controlled.mean.t <- mean(matched.t$exports_1800_1950[matched.t$twitter.is.EVcentral=="not central"])

treated.mean.t - controlled.mean.t

treated.prob.t <- treated.mean.t/mean(matched.t$actual_speakers_m[matched.t$twitter.is.EVcentral=="central"])
controlled.prob.t <- controlled.mean.t/mean(matched.t$actual_speakers_m[matched.t$twitter.is.EVcentral=="not central"])

(treated.prob.t - controlled.prob.t)/controlled.prob.t

### Wikipedia
## Using CEM 
cem.match <- cem(treatment="wikipedia.is.EVcentral",
                 data=WikiFame[,c(2,3,9)])
data_cem3 <- WikiFame[cem.match$matched ,]
post.imbalance.cem3 <- imbalance(group=data_cem3$wikipedia.is.EVcentral, data=data_cem3[,c(2,3)])
post.imbalance.cem3

matched.w <- data_cem3
treated.mean.w <- mean(matched.w$exports_1800_1950[matched.w$wikipedia.is.EVcentral=="central"])
controlled.mean.w <- mean(matched.w$exports_1800_1950[matched.w$wikipedia.is.EVcentral=="not central"])

treated.mean.w - controlled.mean.w

treated.prob.w <- treated.mean.w/mean(matched.w$actual_speakers_m[matched.w$wikipedia.is.EVcentral=="central"])
controlled.prob.w <- controlled.mean.w/mean(matched.w$actual_speakers_m[matched.w$wikipedia.is.EVcentral=="not central"])

(treated.prob.w - controlled.prob.w)/controlled.prob.w



# For books
## Using CEM 
cem.match <- cem(treatment="books.is.EVcentral",
                 data=WikiFame[,c(2,3,10)])
data_cem3 <- WikiFame[cem.match$matched ,]
post.imbalance.cem3 <- imbalance(group=data_cem3$books.is.EVcentral, data=data_cem3[,c(2,3)])
post.imbalance.cem3

matched.b <- data_cem3
treated.mean.b <- mean(matched.b$exports_1800_1950[matched.b$books.is.EVcentral=="central"])
controlled.mean.b <- mean(matched.b$exports_1800_1950[matched.b$books.is.EVcentral=="not central"])

treated.mean.b - controlled.mean.b

treated.prob.b <- treated.mean.b/mean(matched.b$actual_speakers_m[matched.b$books.is.EVcentral=="central"])
controlled.prob.b <- controlled.mean.b/mean(matched.b$actual_speakers_m[matched.b$books.is.EVcentral=="not central"])

(treated.prob.b - controlled.prob.b)/controlled.prob.b





